Predicting how many taxi trips originate in a certain neighborhood has several benefits:
In this project, I have used several open datasets to predict the number of taxi trips in a specific New York neighborhood in a 1 hour time period.
The datasets are the following:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pandas as pd
import numpy as np
import csv
import sqlite3
import time
import json
from sklearn import cluster
from sklearn import feature_selection, linear_model
# NOTE(review): Basemap is unmaintained (superseded by cartopy) -- confirm it
# still installs in the target environment.
from mpl_toolkits.basemap import Basemap
# NOTE(review): geopy removed `vincenty` in 2.0; `geodesic` is the replacement.
from geopy.distance import vincenty, great_circle
import statsmodels.formula.api as smf
import sklearn
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# the replacement is sklearn.model_selection.train_test_split.
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
# set some nicer defaults for matplotlib
from matplotlib import rcParams

# These colors come from colorbrewer2.org (the "Dark2" palette).
# Each is an RGB triplet with components scaled to [0, 1].
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was deprecated in matplotlib 1.5 and removed in 2.0.
# Prefer the modern 'axes.prop_cycle'; fall back for very old matplotlib.
try:
    from cycler import cycler
    rcParams['axes.prop_cycle'] = cycler(color=dark2_colors)
except (ImportError, KeyError):
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
As part of exploratory analysis, I initially looked at how taxi trips vary during different months in a year. I follow analyses such as number of total trips at each hour, number of trips as a function of day of the week, and number of trips on each day of the month.
Here we look at the number of trips recorded by NYC Yellow taxis on a monthly basis. Records for 2013, 2014, and 2015 are analyzed. We also look at the number of Uber trips for the months of April to September 2014.
Each count list is created by iterating through all records of the corresponding month. Finally, a dataframe is created that contains all trip count data and this data is loaded into a csv.
It is not necessary to run the next 5 code blocks since the resulting dataframe has been loaded to a new file which can be directly loaded.
def parse_data(file_path, file_name, year, taxi_co, count_df):
    '''
    Counts the number of records in one year's worth of monthly trip files
    and stores the twelve counts as a new column of count_df.

    Args:
        file_path (str): Directory containing the monthly csv files.
        file_name (str): File name prefix, without the '_YYYY-MM.csv' suffix.
        year (int): Year of the acquired data.
        taxi_co (str): Name of the taxi company whose data is being parsed.
        count_df (DataFrame): Existing dataframe (one row per month) to which
            the counts for this year will be added in place.

    Returns:
        count_df (DataFrame): The same dataframe with a '<year><taxi_co>_trips'
            column holding one record count per month (0 for missing files).
    '''
    count_list = []
    for month in range(1, 13):
        file_path_iter = '%s/%s_%d-%02d.csv' % (file_path, file_name, year, month)
        print(file_path_iter)
        try:
            # 'with' guarantees the file handle is closed even if reading
            # fails partway through.
            with open(file_path_iter, 'r') as trip_data_file:
                trip_data = csv.reader(trip_data_file)
                # Skip the header row if present (None default: empty file
                # simply counts as 0 rows instead of raising).
                next(trip_data, None)
                count_list.append(sum(1 for _ in trip_data))
        except (IOError, OSError):
            # Missing month: record 0 instead of failing the whole year.
            # (The original bare 'except' also hid real bugs; catch I/O only.)
            count_list.append(0)
    count_df['%d%s_trips' % (year, taxi_co)] = count_list
    return count_df
We create a dataframe containing all the trip count data for 2013, 2014, and 2015 yellow taxi trips.
# One row per calendar month; per-year trip-count columns are appended below.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
count_df = pd.DataFrame(columns = ['Month'])
count_df['Month'] = month_names
new_count_df = count_df
# Accumulate the yellow-taxi (TLC) counts for each year into the same frame.
for data_year in (2013, 2014, 2015):
    new_count_df = parse_data('../nyc_taxi_files/%d-data' % data_year,
                              'yellow_tripdata', data_year, 'TLC', new_count_df)
new_count_df
We append the April to September data for Uber to this dataframe. The final dataframe is saved in a csv file and can be loaded in order to run the subsequent analysis
# Append the April-September 2014 Uber pickup counts to the same dataframe.
new_count_df = parse_data('../nyc_taxi_files/uber-trip-data', 'uber-raw-data',
                          2014, 'Uber', new_count_df)
new_count_df
# Persist the combined counts, then reload them; later cells can start from
# this csv without re-running the expensive parsing above.
new_count_df.to_csv('../nyc_taxi_files/trip_count_all_years.csv',index=False)
new_count_df = pd.read_csv('../nyc_taxi_files/trip_count_all_years.csv')
new_count_df
plt.figure(figsize=(20,10))
ax1 = plt.subplot(121)
# Plot from new_count_df (reloaded from csv above) rather than count_df, so
# this cell also works when the parsing blocks were skipped and the counts
# were loaded straight from trip_count_all_years.csv (count_df would then be
# undefined or lack the trip columns).
new_count_df.plot(x='Month', y='2013TLC_trips', kind='line', color = 'red', ax=ax1, label='2013 trips')
new_count_df.plot(x='Month', y='2014TLC_trips', kind='line', color = 'blue', ax=ax1, label='2014 trips')
new_count_df.plot(x='Month', y='2015TLC_trips', kind='line', color = 'green', ax=ax1, label='2015 trips')
ax1.set(title='Number of trips each month')
ax2 = plt.subplot(122)
# Uber data only covers April-September 2014, so restrict to those months.
new_count_df[new_count_df.Month.isin(['Apr','May','Jun','Jul','Aug','Sep'])].plot(x='Month', y='2014Uber_trips', kind='line',
                                                                                  ax=ax2, label='2014 Uber trips')
ax2.set(title='Number of Uber trips from Apr to Sep')
# Months sorted by 2013 trip count; the y-axis is clipped to emphasise the
# month-to-month differences.
count_df_sorted = new_count_df.sort_values(by='2013TLC_trips')
count_df_sorted.plot(x='Month',y='2013TLC_trips',kind='bar',ylim=(1.2e7, 1.6e7))
The following code blocks perform two tasks:
The two files contain one-to-one mappings of records corresponding to the same trips and so a plain merge using the primary keys will join the two files.
%%time
# Load one month (January 2013) of NYC yellow-taxi trip records and the
# matching fare records.
trip_data_file=pd.read_csv('../nyc_taxi_files/2013-data/yellow_tripdata_2013-01.csv')
trip_fare_file=pd.read_csv('../nyc_taxi_files/2013-data/trip_fare_01.csv')
trip_data_file.columns
trip_fare_file.columns
# The fare file's column headers carry stray spaces; strip them so the merge
# keys used below match the trip file's headers exactly.
trip_fare_file.rename(columns=lambda x: x.strip(), inplace = True)
trip_fare_file.columns
trip_data_file.tail()
trip_fare_file.tail()
I now perform the join on the two files.
Subsequently, I add two more columns to the data which are the hours of pickup and dropoff for a particular trip. The values of both these columns range from 0 to 23.
%%time
# One-to-one join of the trip and fare files; these four columns together
# identify a unique trip in both datasets.
trip_full_df=pd.DataFrame.merge(trip_data_file, trip_fare_file,
                                on=['medallion','hack_license','vendor_id','pickup_datetime'])
%%time
# Add the pickup/dropoff hour (0-23) parsed from the datetime strings.
trip_full_df['pickup_hour']=trip_full_df['pickup_datetime'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_hour)
trip_full_df['dropoff_hour']=trip_full_df['dropoff_datetime'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_hour)
trip_full_df.shape
trip_full_df.head()
I now perform some exploratory analysis and visualizations. I look at how number of trips vary by hour of day, day of week, and, at a more granular level, for each day of the month.
I also look at how many trips an individual driver makes in a given month. Individual drivers are identified here by the medallion number (strictly speaking, the medallion identifies the cab and the hack license identifies the driver, but the medallion serves as a reasonable proxy).
%%time
# The fare amount is converted to a float datatype
# An additional tip fraction column is added for analysis of tips which will be performed later
trip_full_df['fare_amount'] = trip_full_df['fare_amount'].astype(float)
# NOTE(review): rows with fare_amount == 0 would make this inf/NaN -- confirm
# zero-fare records are filtered before the tip analysis uses this column.
trip_full_df['tip_fraction'] = trip_full_df['tip_amount']/trip_full_df['fare_amount']
# By grouping by the pickup hour, and then plotting a histogram with 24 bins, we can look at the distribution of trips on an hourly basis.
trip_full_df.pickup_hour.groupby(trip_full_df.pickup_hour.values).size().plot(kind='bar')
%%time
# Adding a new column for the day of the week. I do this by first extracting the wday number and then mapping each day number
# to a day of the week
trip_full_df['pickup_wday'] = trip_full_df['pickup_datetime'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_wday)
# tm_wday convention: 0 == Monday ... 6 == Sunday.
trip_full_df['pickup_day'] = trip_full_df['pickup_wday'].map({0:'Monday', 1:'Tuesday',2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'})
In order to look at the number of trips for each week day, I group by pickup_day column and then count the number of trips. It should be noted however that this bar chart does not give an accurate picture of the number of trips as a function of the day of the week, since in a given month some days might appear 4 times while some others appear 5 times.
# Trip counts per weekday name (not normalised by how many times each weekday
# occurs in the month -- see the note in the text above).
trip_full_df['pickup_day'].groupby(trip_full_df.pickup_day.values).size().plot(kind='bar')
%%time
# I extract the trip date from the pickup_datetime column and assign it to a new column
# (the 'YYYY-MM-DD' prefix before the space).
trip_full_df['full_date'] = trip_full_df['pickup_datetime'].apply(lambda x: x.split(" ")[0])
%%time
# I now introduce a new column for day of the month and look at the number of trips for each day.
trip_full_df['month_day'] = trip_full_df['pickup_datetime'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mday)
trip_full_df['month_day'].groupby(trip_full_df.month_day.values).size().plot(kind='bar')
# trip_pickup_date.groupby(trip_pickup_date).size().plot(kind='bar')
trip_full_df.head()
%%time
# Looking at the distribution of trips on a particular day. I choose Jan 21 since it had the least number of trips.
# NOTE(review): this selection is a view of trip_full_df; pandas may emit a
# SettingWithCopyWarning if it is later modified.
tripdata_jan21 = trip_full_df[trip_full_df.full_date == '2013-01-21']
tripdata_jan21['pickup_hour'].groupby(tripdata_jan21['pickup_hour'].values).size().plot(kind='bar')
%%time
# Comparing trips on Jan 13 (Sunday) with Jan 16 (Wednesday)
tripdata_jan13 = trip_full_df[trip_full_df.full_date == '2013-01-13']
tripdata_jan16 = trip_full_df[trip_full_df.full_date == '2013-01-16']
# Both bar charts are drawn on the current axes, so the two days overlay.
tripdata_jan13['pickup_hour'].groupby(tripdata_jan13['pickup_hour'].values).size().plot(kind='bar')
tripdata_jan16['pickup_hour'].groupby(tripdata_jan16['pickup_hour'].values).size().plot(kind='bar')
As we see, Jan 13 which is a Sunday has a very different pattern than Jan 16.
I also wanted to look at how many trips an individual driver typically makes. To examine this, I looked at the distribution of the trips after grouping by the medallion number.
# Distribution of trips per medallion over the month.
trip_full_df.groupby(['medallion']).size().hist(bins=50)
# Summary statistics; used below to spot zero distances and bad coordinates.
trip_full_df.describe()
Few things we notice. The minimum distance for a trip is 0. Any trip which has a distance of 0 should be filtered during analysis and modeling. Also, the latitude and longitude columns for dropoff have some NaN values. The latitude and longitude also have large numbers that make no sense (e.g the minimum value for pickup longitude is -2771).
%%time
# Drop trips whose recorded distance is exactly 0 (bad records, per the
# observation from describe() above).
trip_df_nonzero = trip_full_df[trip_full_df.trip_distance.astype(float) != 0]
print trip_df_nonzero.shape
trip_df_nonzero.head()
From a histogram of all latitude and longitude values (shown later as part of visualization), it was observed that most taxi trips take place between the latitudes of 40.7 and 40.8 and longitudes of -74.05 and -73.8
%%time
# Converting all location data to floats for easier analysis
# NOTE(review): assigning columns on this filtered frame may trigger a pandas
# SettingWithCopyWarning; adding .copy() at creation would silence it.
trip_df_nonzero[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']] = \
    trip_df_nonzero[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']].astype(float)
# filtering the records belonging to relevant latitudes and longitudes
# (the 40.7-40.8 / -74.05..-73.8 box identified from the coordinate histograms)
trips_with_location = trip_df_nonzero[(trip_df_nonzero.pickup_latitude >= 40.7) & \
                                      (trip_df_nonzero.pickup_latitude <= 40.8) &\
                                      (trip_df_nonzero.pickup_longitude >= -74.05) & (trip_df_nonzero.pickup_longitude <= -73.8)]
print trips_with_location.shape
trips_with_location.head()
We extract a sample from the overall dataset. Later, we will perform some feature engineering on this subset as an experimental step before running the feature engineering algorithm on the entire dataset. Additionally, from this sample, we filter only those trips that are within those latitudes and longitudes where a large number of taxi trips take place.
The sample data has been stored in a separate file so that the sampling code does not have to be run each time.
%%time
# Fixed random_state makes the 500k-row sample reproducible across runs.
sample_trips_with_location = trips_with_location.sample(n=500000, random_state=1)
sample_trips_with_location.head()
In order to make sure that the sample data is fairly representative of the overall dataset, I looked at the trip count distribution for each one-hour slot.
# Hourly trip counts of the sample -- compare with the full-data histogram
# above to check the sample is representative.
sample_trips_with_location['pickup_hour'].groupby(sample_trips_with_location['pickup_hour'].values).size().plot(kind='bar')
%%time
# Save the sample so the sampling step need not be re-run.
sample_trips_with_location.to_csv('../nyc_taxi_files/2013_Jan_samples.csv',index=False)
%%time
sample_trips_with_location = pd.read_csv('../nyc_taxi_files/2013_Jan_samples.csv')
sample_trips_with_location.head()
sample_trips_with_location.shape
We read in the neighborhood dataset available in Open NYC Data and extract the latitude and longitude values of each. We will later use this dataset to add another column in the taxi data.
%%time
# Neighborhood point data from the NYC Open Data portal.
neighborhood_df = pd.read_csv('https://data.cityofnewyork.us/api/views/xyye-rtrs/rows.csv')
neighborhood_df.tail()
I extract the latitude and longitude information from the 'the_geom' parameter in the neighborhood dataset. These values are stored in two new columns.
%%time
# 'the_geom' holds strings shaped like "POINT (<lon> <lat>)": x[7:-1] strips
# the "POINT (" prefix and ")" suffix, and the split separates lon from lat.
neighborhood_df['longitude']=neighborhood_df['the_geom'].apply(lambda x: (x[7: -1]).split(' ')[0])
neighborhood_df['latitude']=neighborhood_df['the_geom'].apply(lambda x: (x[7: -1]).split(' ')[1])
# Change the datatype of the latitude and longitude columns to float
neighborhood_df['longitude'] = neighborhood_df['longitude'].astype(float)
neighborhood_df['latitude'] = neighborhood_df['latitude'].astype(float)
neighborhood_df.head()
I continue working with the sample dataset extracted from the overall dataset earlier. I now combine this sample dataset with the neighborhoods data so that each trip is assigned a neighborhood.
Seen below are the various neighborhoods from the dataset. In order to assign a particular neighborhood to each taxi trip, we would have to iterate through all the neighborhoods to find which one is the closest to a particular pickup point.
# Scatter of every neighborhood point (longitude on x, latitude on y).
plt.scatter(x = neighborhood_df['longitude'], y = neighborhood_df['latitude'])
Instead of iterating through the entire neighborhood dataset while trying to assign a neighborhood to each taxi trip, we can instead use K-means clustering to split the dataset into 5 clusters. Subsequently, we can determine the cluster to which each of the pickup point from the taxi trip dataset belongs to, and then search only within that cluster for the exact neighborhood. This way, we can reduce the processing time by almost a factor of 5.
A possible downside for this approach is that it is possible that we have a cluster centroid lying close to the pickup latitude and longitude but the actual neighborhood closest to the pickup point lies at the border of a different cluster. For now, we ignore these border cases.
centroids = []
# cluster.k_means returns a tuple (centroids, labels, inertia).
cls = cluster.k_means(neighborhood_df[['longitude', 'latitude']].values, 5)
centroids.append(cls[0])
neighborhood_df['clusters'] = cls[1] # A new column is added to the neighborhoods dataset
print cls
neighborhood_df.tail()
# Same scatter as above, coloured by the assigned cluster label.
plt.scatter(x = neighborhood_df['longitude'], y = neighborhood_df['latitude'], c=neighborhood_df['clusters'], s=15, cmap='plasma_r')
I wanted to check whether some neighborhoods share their names with others, i.e. whether there is more than one neighborhood with the same name.
# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is the supported equivalent (and matches its use earlier in
# this notebook).
neighborhood_df_sorted = neighborhood_df.sort_values('Name')
neighborhood_df_sorted.head()
neighborhood_df_sorted.reset_index(drop=True, inplace = True)
neighborhood_df_sorted.head()
# Count how many rows share each name; names appearing in more than one
# borough show up with Borough > 1 below.
neighborhood_grouped = neighborhood_df_sorted.groupby('Name').count()[['Borough']]
neighborhood_grouped[neighborhood_grouped.Borough > 1]
4 of the neighborhoods appear in 2 different boroughs. Let's take a look at these.
# Inspect the neighborhood names that occur in two different boroughs.
neighborhood_df[neighborhood_df.Name.isin(['Bay Terrace','Chelsea','Murray Hill','Sunnyside'])]
As we see, Murray Hill belongs to Manhattan as well as Queens, Chelsea belongs to Manhattan as well as Staten Island, and so on. These will be handled later on after we add the neighborhoods data to the trip data.
A Dataframe with information on the centroids is defined below. This will be used to find out which centroid lies nearest to a given pickup point. Only neighborhoods of that particular cluster will be searched when determining the neighborhood of a pickup point.
# Lon/lat of the 5 k-means centres plus an integer cluster id; used below to
# pick which cluster to search for each pickup point.
centroids_df = pd.DataFrame(data=centroids[0], columns=('centroid_lon','centroid_lat'))
centroids_df['cluster'] = centroids_df.index.astype(int)
centroids_df
The following method takes as argument the latitude and longitude information and the cluster index. It returns the neighborhood from the specified cluster and is closest to the given latitude and longitude.
def search_cluster(lat,lon,cluster_ind):
    '''
    Determines the New York neighborhood that is most likely to contain a location identified from its latitude and longitude.

    Args:
        lat (float): Latitude of the required location.
        lon (float): Longitude of the required location.
        cluster_ind (int): Index of the cluster of neighborhoods within which the location lies.
            There are a total of 5 clusters into which all the neighborhoods have been classified.

    Returns:
        neighborhood_name (str): Name of the neighborhood whose point is
            nearest (planar distance in degrees) to the given location, or
            None if the cluster contains no neighborhoods.
    '''
    # Reads the module-level neighborhood_df built earlier in the notebook.
    cluster_df = neighborhood_df[neighborhood_df['clusters'] == cluster_ind]
    nearest_name = None
    nearest_dist = float('inf')
    for index2, neighborhood in cluster_df.iterrows():
        # Planar (Euclidean) distance in degrees -- adequate for ranking
        # nearby points, though it is not a distance in miles as the original
        # variable name suggested.
        dist = np.sqrt((lat - neighborhood['latitude'])**2 + (lon - neighborhood['longitude'])**2)
        # Track the running minimum instead of building a dict keyed by the
        # float distance (which silently collapsed equidistant neighborhoods
        # and held every distance in memory).
        if dist < nearest_dist:
            nearest_dist = dist
            nearest_name = neighborhood['Name']
    return nearest_name
Here, I iterate through all the rows in the sample trip dataset and determine which cluster it belongs to. I then call the method search_cluster to determine which neighborhood the pickup point belongs to.
The result is stored as an array of neighborhood names.
Note that this code block takes more than 1 hour to execute. If you don't want to execute this, you can move on and instead pull the new modified dataset that already contains the neighborhood data and has been saved.
%%time
# Assign each sampled trip a neighborhood: first find the nearest of the 5
# cluster centroids, then search only that cluster's neighborhoods.
neighborhood_array=[]
for index1, taxi_row in sample_trips_with_location.iterrows():
    dist_min = 100   # larger than any plausible distance in degrees
    cluster_ind = 6  # sentinel; always overwritten since dist < 100 holds
    for index2, centroid in centroids_df.iterrows():
        # Planar distance in degrees from the pickup point to the centroid.
        dist = np.sqrt((taxi_row['pickup_latitude'] - centroid['centroid_lat'])**2 + (taxi_row['pickup_longitude'] - centroid['centroid_lon'])**2)
        if dist < dist_min:
            dist_min = dist
            cluster_ind = centroid['cluster'].astype(int)
    nearest_nbrhd = search_cluster(taxi_row['pickup_latitude'],taxi_row['pickup_longitude'],cluster_ind)
    neighborhood_array.append(nearest_nbrhd)
print neighborhood_array[0:5]
len(neighborhood_array)
I insert the neighborhood array into a new column of the trip sample data.
# Attach the computed neighborhood names to the sample (same row order as the
# iteration above).
sample_trips_with_location['neighborhood'] = neighborhood_array
sample_trips_with_location.shape
sample_trips_with_location.head()
This dataframe with the neighborhood data is saved in a csv and can be pulled later. This ensures that the cluster search does not have to be executed each time we need this dataset.
%%time
# Save the sample with its neighborhood column so the hour-long cluster
# search above need not be repeated.
sample_trips_with_location.to_csv('../nyc_taxi_files/2013_Jan_sample_neighborhood.csv',index=False)
%%time
sample_trips_with_location = pd.read_csv('../nyc_taxi_files/2013_Jan_sample_neighborhood.csv')
sample_trips_with_location.head()
I now proceed to look at the number of trips on a particular day in a given neighborhood and in a 1-hour slot. For this, we need to group by ['full_date','pickup_hour','neighborhood'].
# Trip counts per (date, hour, neighborhood) group.
sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood']).count()
I define a new dataframe that contains the date, pickup hour, pickup day, neighborhood, number of trips, and month day.
# Empty frame to hold one row per (date, hour, neighborhood) combination.
trip_count_df = pd.DataFrame(columns = ['full_date','pickup_hour','pickup_day','neighborhood','month_day','no_of_trips'])
%%time
# All columns except the number of trips are assigned using the .first() corresponding to that column in a particular group
trip_count_df['full_date'] = sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood'])['full_date'].first().values
trip_count_df['pickup_hour'] = sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood'])['pickup_hour'].first().values
trip_count_df['neighborhood'] = sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood'])['neighborhood'].first().values
trip_count_df['pickup_day'] = sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood'])['pickup_day'].first().values
trip_count_df['month_day'] = sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood'])['month_day'].first().values
# The number of trips is determined using the .count() method
trip_count_df['no_of_trips'] = sample_trips_with_location.groupby(['full_date','pickup_hour','neighborhood'])['neighborhood'].count().values
trip_count_df.head()
trip_count_df.shape
print trip_count_df['neighborhood'].unique().size
print neighborhood_df['Name'].unique().size
I remove the data corresponding to 'Bay Terrace','Chelsea','Murray Hill', and 'Sunnyside' neighborhoods. These are the neighborhoods that appear more than once.
%%time
# Drop the neighborhoods whose names occur in more than one borough (found
# above), since the name alone is ambiguous as a join key.
trip_count_df = trip_count_df[~(trip_count_df.neighborhood.isin(['Bay Terrace','Chelsea','Murray Hill','Sunnyside']))]
trip_count_df.shape
I include the details of the neighborhood data, such as latitude, longitude, and the Borough it belongs to, by joining the trip_count_df dataframe with the neighborhood_df dataframe.
%%time
# Join the per-(date, hour, neighborhood) counts with the neighborhood
# details (borough, lon/lat) on the neighborhood name.
trip_count_with_neighborhood = trip_count_df.set_index('neighborhood').join(neighborhood_df.set_index('Name'))
trip_count_with_neighborhood.shape
trip_count_with_neighborhood.head()
I want to retain only some of the important columns of this dataset. These columns are extracted as shown below.
%%time
# Keep only the modelling columns, then move the neighborhood name back from
# the index into an ordinary column.
trip_count_with_neighborhood = trip_count_with_neighborhood[['full_date','pickup_hour','pickup_day','month_day','Borough','longitude','latitude','no_of_trips']]
trip_count_with_neighborhood['neighborhood'] = trip_count_with_neighborhood.index.values
trip_count_with_neighborhood.reset_index(drop=True,inplace=True)
trip_count_with_neighborhood.head()
trip_count_with_neighborhood.shape
print trip_count_with_neighborhood.Borough.unique()
print trip_count_with_neighborhood.neighborhood.unique().size
%%time
# Save the modelling dataset; the modelling section below reloads from here.
trip_count_with_neighborhood.to_csv('../nyc_taxi_files/2013_Jan_sample_tripcount.csv',index = False)
I use statsmodels and scikit-learn in order to build a predictive model for the no of taxi trips in a given neighborhood for a given 1 hour time period.
%%time
# Start of the modelling section: reload the prepared per-hour trip counts.
trip_count_with_neighborhood = pd.read_csv('../nyc_taxi_files/2013_Jan_sample_tripcount.csv')
trip_count_with_neighborhood.head()
I progressively add the predictors to a ordinary least square linear model and observe the R-squared value. Subsequently, when I fit a scikit-learn to the same data, I will look at the mean square error.
# Baseline OLS with the categorical neighborhood as the only predictor.
lm = smf.ols(formula='no_of_trips ~ neighborhood', data=trip_count_with_neighborhood).fit() #using neighborhood as the sole predictor
lm.summary()
We observe that the R-squared value is decent.
Some neighborhoods have lower p-value than others. I wanted to compare two of those neighborhoods with two that have higher p-value.
# Compare rows for neighborhoods with low vs. high coefficient p-values
# (p-values taken from the OLS summary above).
trip_count_with_neighborhood[trip_count_with_neighborhood.neighborhood == 'Battery Park City'] #p-value = 0
trip_count_with_neighborhood[trip_count_with_neighborhood.neighborhood == 'Astoria Heights'] #p-value = 0.9
print trip_count_with_neighborhood[trip_count_with_neighborhood.neighborhood == 'Carnegie Hill'].shape #p-value = 0
print trip_count_with_neighborhood[trip_count_with_neighborhood.neighborhood == 'Bushwick'].shape #p-value = 0.706
It is possible that neighborhoods with lower number of entries have a higher p-value.
Next, I create dummy variables for the pickup hour. These dummy variables will be additional predictors for the trip count.
# Dummy variables for the pickup hour (columns hour_0 ... hour_23).
hour = pd.get_dummies(trip_count_with_neighborhood.pickup_hour, prefix = 'hour')
# Join all of hour's columns at once; the previous explicit list of the 24
# names selected exactly the same set and would raise KeyError if any hour
# were absent from the data.
trip_count_with_neighborhood = trip_count_with_neighborhood.join(hour)
# Correlation heatmap over the numeric columns (dates and raw hour excluded).
sns.heatmap(trip_count_with_neighborhood.drop(['full_date','pickup_hour'], axis=1).corr())
We notice a correlation between latitude, longitude and the no of trips; also, there is light correlation between the dummy hour variables and the no of trips.
I now perform an OLS fit using neighborhood and the pickup hour as predictors. We observe that the R-squared value has a slight increase.
# OLS using the neighborhood plus one dummy variable per pickup hour.
hour_terms = ' + '.join('hour_%d' % h for h in range(24))
lm = smf.ols(formula='no_of_trips ~ neighborhood + ' + hour_terms,
             data=trip_count_with_neighborhood).fit()
lm.summary()
Next I perform an OLS with neighborhood, pickup hour, and the day of the week.
We observe only a slight increase beyond what we had seen earlier. It is likely that one of the reasons the pickup day is not a significant predictor is the fact that all weekdays have similar patterns. Perhaps a column that classifies whether the day is a weekday or weekend, can act as a better predictor.
# OLS adding the day of the week on top of the neighborhood and hour dummies.
hour_terms = ' + '.join('hour_%d' % h for h in range(24))
lm = smf.ols(formula='no_of_trips ~ neighborhood + pickup_day + ' + hour_terms,
             data=trip_count_with_neighborhood).fit()
lm.summary()
I now introduce dummy variables for the neighborhoods. This is required if we want to use a linear model fit in scikit learn.
# Dummy-encode the neighborhood names so scikit-learn's linear models can
# use them as predictors.
neighborhood_dummies = pd.get_dummies(trip_count_with_neighborhood.neighborhood, prefix = 'Nbr')
trip_count_all_dummies = pd.concat([trip_count_with_neighborhood, neighborhood_dummies], axis=1)
print(trip_count_all_dummies.shape)
trip_count_all_dummies.head()
The following method takes in as arguments the input, output, and the regression model to be used. It returns the R-squared value, mean squared error, and also a histogram of the residuals. The method is called subsequently using various predictor variables for building a lasso, ridge, or elasticnet regression model.
def get_linear_model_metrics(X, y, algo):
    '''
    Fits the given regression model and reports its training-set metrics.

    Args:
        X (DataFrame): Predictor columns.
        y (Series): Target variable (number of trips).
        algo: Any scikit-learn style regressor exposing fit/predict/score.

    Returns:
        algo: The fitted model.

    Side effects: prints the residuals, mean squared error, and R-squared,
    and plots a histogram of the residuals on a new figure.
    '''
    # (The original also computed feature_selection.f_regression p-values
    # here but never used them; that dead work has been removed.)
    # .fit() runs the regression of y on X.
    algo.fit(X, y)
    residuals = (y - algo.predict(X)).values
    print(residuals)
    print('Mean Squared Error:', (sum(residuals**2))/len(y))
    # print('Coefficients:', algo.coef_)
    # print('y-intercept:', algo.intercept_)
    print('R-Squared:', algo.score(X, y))
    plt.figure()
    # np.ceil returns a float; matplotlib requires an integer bin count.
    plt.hist(residuals, bins=int(np.ceil(np.sqrt(len(y)))))
    # keep the model
    return algo
For the scikit learn linear model, I exclude all the columns except the neighborhood and pickup hour dummy variables.
Ridge Regression gives us the best fit.
# Predictors: only the neighborhood and hour dummy columns remain after the
# drop below.
X = trip_count_all_dummies.drop(['full_date','pickup_day','pickup_hour','no_of_trips','Borough','longitude','latitude','neighborhood','month_day'], axis=1)
y = trip_count_all_dummies['no_of_trips']
# Fit and report each regulariser with its default hyper-parameters.
estimators = [
    linear_model.Lasso(),
    linear_model.Ridge(),
    linear_model.ElasticNet(),
]
for est in estimators:
    print est
    get_linear_model_metrics(X, y, est)
    print
I extract the weather data from the website www.wunderground.com. Subsequently, I join the weather data with the trip count data and use the weather parameters as additional predictors.
# Daily weather for the KNYC station, January 2013, fetched as csv (format=1).
# NOTE(review): this wunderground endpoint may no longer be served -- confirm
# a local copy of the csv exists before re-running.
weather_df = pd.read_csv('https://www.wunderground.com/history/airport/KNYC/2013/1/1/MonthlyHistory.html?&reqdb.zip=&reqdb.magic=&reqdb.wmo=&MR=1&format=1')
weather_df.head()
weather_df.columns
# Normalise the scraped column names: drop the stray leading spaces and
# replace inner spaces with underscores so the names are usable in
# statsmodels formulas below.
weather_df.rename(columns={'Max TemperatureF': 'Max_TemperatureF', 'Mean TemperatureF': 'Mean_TemperatureF',
                           'Min TemperatureF':'Min_TemperatureF', 'Max Dew PointF':'Max_Dew_PointF',
                           'MeanDew PointF':'MeanDew_PointF', 'Min DewpointF':'Min_DewpointF','Max Humidity':'Max_Humidity',
                           ' Mean Humidity':'Mean_Humidity',' Min Humidity':'Min_Humidity',
                           ' Max Sea Level PressureIn':'Max_Sea_Level_PressureIn',' Mean Sea Level PressureIn':'Mean_Sea_Level_PressureIn',
                           ' Min Sea Level PressureIn':'Min_Sea_Level_PressureIn',' Max VisibilityMiles':'Max_VisibilityMiles',
                           ' Mean VisibilityMiles':'Mean_VisibilityMiles',' Min VisibilityMiles':'Min_VisibilityMiles',
                           ' Max Wind SpeedMPH':'Max_Wind_SpeedMPH',' Mean Wind SpeedMPH':'Mean_Wind_SpeedMPH',
                           ' Max Gust SpeedMPH':'Max_Gust_SpeedMPH','PrecipitationIn':'PrecipitationIn', ' CloudCover':'CloudCover', ' Events':'Events',
                           ' WindDirDegrees<br />':'WindDirDegrees'}, inplace=True)
weather_df.columns
weather_df['WindDirDegrees'] = weather_df['WindDirDegrees'].apply(lambda x: x.split('<')[0]) #Removing the </br> from the 'WinDirDegrees' column
weather_df.head()
print weather_df['Events'].unique()
print weather_df['WindDirDegrees'].unique()
print weather_df['PrecipitationIn'].unique()
# 'T' marks a trace of precipitation; treat it as 0 so the column is numeric.
weather_df.loc[(weather_df['PrecipitationIn'] == 'T'),'PrecipitationIn'] = 0 #Replace precipitation 'T' with 0
weather_df['PrecipitationIn'] = weather_df['PrecipitationIn'].astype(float)
# Day of month (1-31) from the 'EST' date string; this is the join key below.
weather_df['date'] = weather_df['EST'].apply(lambda x: time.strptime(x, "%Y-%m-%d").tm_mday)
weather_df.head()
trip_count_all_dummies.head()
I now combine the trip data with the weather data using the index 'date' which corresponds to 'month_day' in the trip_count_all_dummies dataframe.
# Join each trip-count row with that day's weather via the day of the month.
trip_and_weather_df = trip_count_all_dummies.set_index('month_day').join(weather_df.set_index('date'))
trip_and_weather_df['month_day'] = trip_and_weather_df.index.values
trip_and_weather_df.reset_index(drop=True, inplace=True)
trip_and_weather_df.head()
trip_and_weather_df.columns.values
# Correlations among the weather attributes; used to choose the predictor
# subset in the next section.
sns.heatmap(trip_and_weather_df[['Max_TemperatureF', 'Mean_TemperatureF', 'Min_TemperatureF',\
                                 'Max_Dew_PointF', 'MeanDew_PointF', 'Min_DewpointF', 'Max_Humidity',\
                                 'Mean_Humidity', 'Min_Humidity', 'Max_Sea_Level_PressureIn',\
                                 'Mean_Sea_Level_PressureIn', 'Min_Sea_Level_PressureIn',\
                                 'Max_VisibilityMiles', 'Mean_VisibilityMiles',\
                                 'Min_VisibilityMiles', 'Max_Wind_SpeedMPH', 'Mean_Wind_SpeedMPH',\
                                 'Max_Gust_SpeedMPH', 'PrecipitationIn', 'CloudCover',\
                                 'WindDirDegrees']].corr())
The heatmap of the weather data tells us that attributes such as max and min temperatures, max and min humidity, max and min sea level pressure are autocorrelated. Thus, as predictors, I will choose only the mean values for these weather attributes. All the max and min values are therefore dropped while performing linear model fits in scikit-learn and statsmodels.
# Predictors: the hour and neighborhood dummies plus the mean-valued weather
# attributes; the correlated max/min weather columns are dropped per the
# heatmap above. The original drop list repeated 'month_day' and 'Events';
# the duplicates are removed here.
X = trip_and_weather_df.drop(['full_date','pickup_day','pickup_hour','no_of_trips','Borough','longitude','latitude',
                              'neighborhood','month_day','Events','Max_TemperatureF','Min_TemperatureF','Max_Dew_PointF',
                              'Min_DewpointF','Max_Humidity','Min_Humidity', 'Max_Sea_Level_PressureIn','Min_Sea_Level_PressureIn',
                              'Max_VisibilityMiles','Min_VisibilityMiles','Max_Wind_SpeedMPH','EST'], axis=1)
y = trip_and_weather_df['no_of_trips']
# Fit and report each regulariser with its default hyper-parameters.
estimators = [
    linear_model.Lasso(),
    linear_model.Ridge(),
    linear_model.ElasticNet(),
]
for est in estimators:
    print(est)
    get_linear_model_metrics(X, y, est)
    print('')
Once again, Ridge Regression gives us the best fit.
I also perform another linear model fit with statsmodels, which gives me similar results to the Ridge regression fit in scikit-learn.
# OLS with neighborhood, weekday, hour dummies, and the mean-valued weather
# attributes as predictors.
hour_terms = ' + '.join('hour_%d' % h for h in range(24))
weather_terms = ' + '.join(['Mean_TemperatureF', 'MeanDew_PointF', 'Mean_Humidity',
                            'Mean_Sea_Level_PressureIn', 'Mean_VisibilityMiles',
                            'Mean_Wind_SpeedMPH', 'Max_Gust_SpeedMPH',
                            'PrecipitationIn', 'CloudCover'])
lm = smf.ols(formula='no_of_trips ~ neighborhood + pickup_day + ' + hour_terms + ' + ' + weather_terms,
             data=trip_and_weather_df).fit()
lm.summary()
Finally, I perform a decision tree regression with cross-validation using a train-test split on the dataset. I do a grid search to find the optimum max_depth for the tree. The min_samples_split has been assigned to a default 2.
# Rebuild the feature matrix with the same predictors as the linear fits.
# NOTE: the original drop list repeated 'month_day' and 'Events'; deduplicated.
tree_drop_cols = ['full_date', 'pickup_day', 'pickup_hour', 'no_of_trips',
                  'Borough', 'longitude', 'latitude', 'neighborhood',
                  'month_day', 'Events', 'EST',
                  'Max_TemperatureF', 'Min_TemperatureF',
                  'Max_Dew_PointF', 'Min_DewpointF',
                  'Max_Humidity', 'Min_Humidity',
                  'Max_Sea_Level_PressureIn', 'Min_Sea_Level_PressureIn',
                  'Max_VisibilityMiles', 'Min_VisibilityMiles',
                  'Max_Wind_SpeedMPH']
X = trip_and_weather_df.drop(tree_drop_cols, axis=1)
y = trip_and_weather_df['no_of_trips']

# Hold out 20% of the rows for evaluation.
train_data, test_data, train_op, test_op = train_test_split(X, y, test_size=0.2)
print('Train Data size: %s' % (train_data.shape,))
print('Test Data size: %s' % (test_data.shape,))

# Grid search over max_depth 1..25; all other hyperparameters stay at their
# defaults (min_samples_split=2 etc.), so the redundant explicit kwargs from
# the original are dropped. Track the best test R-squared as we go.
best_depth = None
best_r2 = float('-inf')
for max_depth in range(1, 26):
    regressor = DecisionTreeRegressor(criterion='mse', max_depth=max_depth)
    print(regressor)
    regressor.fit(train_data, train_op)
    print('Train Data regressor score: %f' % regressor.score(train_data, train_op))
    op = regressor.predict(test_data)
    print('Mean Squared Error: %f' % sklearn.metrics.mean_squared_error(test_op, op))
    print('Median Absolute Error: %f' % sklearn.metrics.median_absolute_error(test_op, op))
    r2 = sklearn.metrics.r2_score(test_op, op)
    print('Test set R-squared: %f' % r2)
    if r2 > best_r2:
        best_depth, best_r2 = max_depth, r2
print('Best max_depth: %s (test R-squared %f)' % (best_depth, best_r2))
A max_depth of 20 gives us the best result on the test dataset.
The following blocks of code contain analysis of fare amount and tip amount for all taxi trips i.e. the entire dataset.
First I look at the distribution of mean tip amount received by drivers.
%%time
trip_full_df.groupby('medallion')['tip_amount'].mean().hist(bins=100)
One possible reason the graph shows so many drivers getting low tips is that the data is skewed by a large number of records with tip_amount = 0. If I remove these records and plot the histogram, the resulting histogram will be a more accurate portrayal of the average tip amount received by drivers.
%%time
trip_full_df[trip_full_df.tip_amount != 0].groupby('medallion')['tip_amount'].mean().hist(bins=100)
I was interested to look at the relationship between tip amount and parameters such as trip distance, fare amount. I also looked at the relationship between fare amount and trip distance.
For this analysis, I only looked at the first 500,000 records and not the entire dataset.
# Coerce the distance and monetary columns to float — presumably they were
# read in as strings; TODO confirm against the loading code.
for numeric_col in ['trip_distance', 'fare_amount', 'tip_amount', 'total_amount']:
    trip_full_df[numeric_col] = trip_full_df[numeric_col].astype(float)

# Restrict to the first 500,000 records, and plot only positive-distance
# trips. The filter is computed once and reused for all three regressions.
fare_data_head = trip_full_df.head(500000)
moving_trips = fare_data_head[fare_data_head.trip_distance > 0]
sns.lmplot('trip_distance', 'fare_amount', moving_trips, size=10)
sns.lmplot('trip_distance', 'tip_amount', moving_trips, size=10)
sns.lmplot('fare_amount', 'tip_amount', moving_trips, size=10)
The relationship between tip amount and fare amount shows a linear characteristic with several specific slopes (probably at 10%, 15%, 20% and so on).
Fare amount as a function of trip distance is also linear as expected. However, it's interesting to note that there is a constant fare amount of $52 for several trips, irrespective of the trip distance. This turns out to be the flat fare charged for trips between Manhattan and JFK Airport.
I then looked at the distribution of the median tip fraction received by individual drivers, by grouping by the medallion numbers.
Next, I look at the overall distribution of the tip fraction. Only records for which tip fraction lies between 0 and 0.5 are looked at.
%%time
trip_full_df['tip_fraction'] = trip_full_df['tip_amount']/trip_full_df['fare_amount']
trip_full_df.groupby('medallion')['tip_fraction'].median().hist(bins=250)
%%time
trip_full_df[(trip_full_df['tip_fraction'] > 0) & (trip_full_df['tip_fraction'] < 0.5)]['tip_fraction'].hist(bins=250)
There is a distinct spike at 0.2, i.e. a 20% tip. Also, tip fractions above 0.2 are more common than those below it.
Finally, I wrap it up by looking at the distribution of log of tips, since tip amounts in general was seen to be tail-heavy.
%%time
trip_full_df['tip_amount_log'] = trip_full_df['tip_amount'].apply(np.log10)
trip_full_df[trip_full_df['tip_amount_log'] > 0]['tip_amount_log'].hist(bins=100)
from bokeh.plotting import figure, output_notebook, show
import datashader as ds
I load only the latitude and longitude data of the overall NYC Taxi dataset into a new dataframe. I first plot a histogram of the latitudes and longitudes of the pickup locations to see where most trips take place. I then use Datashader library to visualize regions where, as an aggregate, the number of pickups are more than the number of dropoffs and vice versa.
%time
df = pd.read_csv('../nyc_taxi_files/trip_data_files/trip_data_1.csv', usecols= \
['pickup_longitude', 'pickup_latitude', 'pickup_datetime', 'dropoff_datetime', 'dropoff_longitude','dropoff_latitude', 'passenger_count'])
df.tail()
df['pickup_hour']=df['pickup_datetime'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_hour)
df['dropoff_hour']=df['dropoff_datetime'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_hour)
df.tail()
df.describe()
print df[(df['pickup_longitude']>-74.1)&(df['pickup_longitude']<-73.8)]['pickup_longitude'].median()
df[(df['pickup_longitude']>-74.1)&(df['pickup_longitude']<-73.8)]['pickup_longitude'].hist(bins=50)
print df[(df['pickup_latitude']>40.6)&(df['pickup_latitude']<40.9)]['pickup_latitude'].median()
df[(df['pickup_latitude']>40.5)&(df['pickup_latitude']<40.9)]['pickup_latitude'].hist(bins=50)
# Render Bokeh output inline in the notebook, and fix the lon/lat window
# shared by all of the map plots below (roughly the Manhattan area).
output_notebook()
x_range=(-74.05,-73.8)
y_range=(40.7,40.8)
def base_plot(tools='pan,wheel_zoom,reset',plot_width=900, plot_height=600, **plot_args):
    """Return a bare Bokeh figure over the module-level lon/lat window.

    Axes and gridlines are hidden so the figure can serve as a clean
    background for map overlays. Extra keyword args pass through to figure().
    """
    fig = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
                 x_range=x_range, y_range=y_range, outline_line_color=None, **plot_args)
    fig.axis.visible = False
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    return fig
# Default glyph styling for scatter-style Bokeh plots. Note: not referenced
# by the datashader maps below — presumably used by other cells; verify.
options = dict(line_color=None, fill_color='blue', size=5)
# Widen the notebook container to 90% of the browser window.
from IPython.core.display import HTML, display
display(HTML("<style>.container { width:90% !important; }</style>"))
import datashader as ds
from datashader import transfer_functions as tf
from datashader.colors import Greys9
# Reversed greyscale palette with the two lightest shades dropped.
Greys9_r = list(reversed(Greys9))[:-2]
def draw_pickup_dropoff_maps(hour_start, hour_end):
    '''
    Render a datashader image contrasting pickup- vs dropoff-dominated pixels.

    Trips whose hour h satisfies hour_start <= h < hour_end are aggregated.
    Pixels where pickups outnumber dropoffs are shaded blue; pixels where
    dropoffs outnumber pickups are shaded red.

    Args:
        hour_start (int): Inclusive lower bound of the hour window (0 to 23).
        hour_end (int): Exclusive upper bound of the hour window (1 to 24).

    Returns:
        img (stack image): datashader image stacking the two density layers.
    '''
    cvs = ds.Canvas(plot_width=900, plot_height=600, x_range=x_range, y_range=y_range)

    def in_window(hour_col):
        # Boolean mask selecting trips whose hour falls in the window.
        return (df[hour_col] >= hour_start) & (df[hour_col] < hour_end)

    # Aggregate pickups and dropoffs separately on the same canvas grid.
    pickups = cvs.points(df[in_window('pickup_hour')], 'pickup_longitude', 'pickup_latitude')
    dropoffs = cvs.points(df[in_window('dropoff_hour')], 'dropoff_longitude', 'dropoff_latitude')

    # Shade each aggregate only where it dominates the other, then stack.
    pickup_layer = tf.interpolate(pickups.where(pickups > dropoffs),
                                  cmap=["lightblue", 'blue'], how='eq_hist')
    dropoff_layer = tf.interpolate(dropoffs.where(dropoffs > pickups),
                                   cmap=["lightpink", 'red'], how='eq_hist')
    return tf.stack(pickup_layer, dropoff_layer)
Using the method above, I proceed to visualize the nature of taxi trips as a day passes by. Regions where pickups are greater are marked as blue while regions where dropoffs are greater are marked red.
Shown below are 12 visualizations of pickup and dropoff trends for 2 hour time periods. I start with the trips between 12 to 2 AM and end with trips between 10 PM to 12 AM.
# Render the pickup/dropoff dominance map for every 2-hour window of the day:
# 12 images, from midnight-2AM through 10PM-midnight. The original repeated
# the call/display pair 12 times by hand; a loop over the window starts is
# equivalent and far less error-prone. display() (imported above from
# IPython.core.display) is needed so each image renders from inside the loop.
for window_start in range(0, 24, 2):
    img = draw_pickup_dropoff_maps(window_start, window_start + 2)
    display(img)